# 文本分类RNN
import tensorflow as tf
import numpy as np
import re
from tensorflow.contrib import learn
from tensorflow.contrib.layers import fully_connected
import os

# os.environ["TF_CPP_MIN_LOG_LEVEL"]='3'
np.random.seed(777)
tf.set_random_seed(777)

#对文本进行清洗处理
def clean_str(string): #去掉文本中的无效字符并切分单词
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)  #去掉A-Za-z0-9(),!?\
    string = re.sub(r"\'s", " \'s", string)   #将's，转化成' s'
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)  #大于2个空白字符
    return string.strip().lower()   #去掉两边空格,并变成小写

#定义训练超参数
learning_rate = 0.001  #学习率
training_epochs = 10 #训练总周期

dev_sample_percentage = 0.1 #测试集的比例
# positive_data_file ='rt-polarity.pos'  #正面评价文本路径
# negative_data_file ='rt-polarity.neg'  #负面评价文本路径
path = r'..\..\..\..\..\large_data\DL1\cnn_text\\'
is_test = False
if is_test:
    xname = '2'
else:
    xname = ''
positive_data_file = path + 'rt-polarity' + xname + '.pos'  #正面评价文本
negative_data_file = path + 'rt-polarity' + xname + '.neg'  #负面评价文本

#读取正面评价文件，进行适当处理
positive_examples = list(open(positive_data_file, "r", encoding='gbk',errors='ignore').readlines())  #读取正面评价文件，多行
positive_examples = [s.strip() for s in positive_examples]   #去掉每个句子的两边空格
# print(positive_examples) #正面评价文本（临时测试用）

#读取负面评价文件，进行适当处理
negative_examples = list(open(negative_data_file, "r", encoding='gbk',errors='ignore').readlines()) #读取负面评价文件，多行
negative_examples = [s.strip() for s in negative_examples] #去掉每个句子的两边空格
# print(negative_examples) #负面评价文本（临时测试用）

#拼接正负文本,清洗文件
x_text = positive_examples + negative_examples  #把所有正面和负面文本拼接起来，axis=0
x_text = [clean_str(sent) for sent in x_text]   #去掉无效字符并切分单词

#生成标签，对标签独热处理
positive_labels = [[0, 1] for _ in positive_examples] # 标签Y独热编码，正面[0,1]
negative_labels = [[1, 0] for _ in negative_examples] # 标签Y独热编码，负面[1,0]
y_data = np.concatenate([positive_labels, negative_labels], 0)  #连接所有标签,axis=0
# print(x_text) #经过处理的文本（临时测试用）
# print(y_data) #标签（临时测试用）

with open('x_sentences.txt', 'w') as f:
    x_sentences_with_n = [sent + '\n' for sent in x_text]
    f.writelines(x_sentences_with_n)
np.savetxt('y.np.txt', y_data)

#建立词汇表
max_document_length = max([len(x.split(" ")) for x in x_text])  #返回一个句子中最大的单词数:56, x=每一个句子
print('一个句子最大的单词数：', max_document_length)  #（临时测试用）

#把所有的单词重新编码：
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)   #建立单词表对象，其中每个单词对应一个编号
x = np.array(list(vocab_processor.fit_transform(x_text)))    # x.shape=(10662,56)
print('单词总数：', len(vocab_processor.vocabulary_)) #（临时测试用）
print('句子的编码', x) #（临时测试用）

# [[    1     2     3 ...     0     0     0]  # 0表示填充， 非0表示单词在词典中的编号，每一行代表一个句子，最长56个单词(列)
#  [    1    31    32 ...     0     0     0]
#  [   57    58    59 ...     0     0     0]
#  ...
#  [   75    84  1949 ...     0     0     0]
#  [    1  2191  2690 ...     0     0     0]
#  [11513     3   147 ...     0     0     0]]

## Extract word:id mapping from the object.
vocab_dict = vocab_processor.vocabulary_._mapping

## Sort the vocabulary dictionary on the basis of values(id).
## Both statements perform same task.
#sorted_vocab = sorted(vocab_dict.items(), key=operator.itemgetter(1))
sorted_vocab = sorted(vocab_dict.items(), key = lambda x : x[1])

## Treat the id's as index into list and create a list of words in the ascending order of id's
## word with id i goes at index i of the list.
vocabulary = list(list(zip(*sorted_vocab))[0])
import pandas as pd
df = pd.DataFrame({0: vocabulary})
# df.sort_values(0).to_csv('vocab.txt', index=False)

# 对数据集进行洗牌
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y_data)))  #生成len(y_data)的随机序列
x_shuffled = x[shuffle_indices]
y_shuffled = y_data[shuffle_indices]

#切分训练集和测试集
dev_sample_index = -1 * int(dev_sample_percentage * float(len(y_data)))   #测试集的样本个数
x_train, x_dev = np.split(x_shuffled, [dev_sample_index,])  #训练和测试集样本的X
y_train, y_dev = np.split(y_shuffled, [dev_sample_index,])  #训练和测试集样本的y

# np.savetxt('x_train.txt', x_train)
# np.savetxt('y_train.txt', y_train)

total = x_train.shape[0]    #训练集样本总数
sequence_length = x_train.shape[1]   # 序列长度，句子的最大长度 56
print('训练集', x_train.shape, '(句子数,每个句子的最大单词数)') #（临时测试用）
print('测试集：', x_dev.shape) #（临时测试用）

g_b=0
# 自己实现next_batch函数，每次返回一批数据
def next_batch(size):
    global g_b
    xb = x_train[g_b:g_b+size]
    yb = y_train[g_b:g_b+size]
    g_b = g_b + size
    return xb,yb

#定义rnn单元参数
n_neurons = 128  #隐藏状态，神经元个数
n_outputs = 2    #输出2分类
n_layers = 2     #堆叠层数

#定义rnn网络参数
embedding_size = n_neurons # 词向量的维度
n_steps = max_document_length #时间步数（序列长度）
n_inputs = embedding_size #输入数据长度

batch_size = 64 #训练的每批样本

# 定义占位符
X = tf.placeholder(tf.int32, [None, max_document_length])
Y = tf.placeholder(tf.int32, [None, 2]) # 独热编码

# 加入嵌入层
W = tf.Variable(tf.random_uniform([len(vocab_processor.vocabulary_), embedding_size], -1.0, 1.0))  #(10662,8)
X_data = tf.nn.embedding_lookup(W, X)  # (?, 56, 128)  #X中每一个单词通过embedding_lookup，从W中查询相应编号对应的词向量
# print(X_data) #（临时测试用）

#定义rnn单元的基本模型
# cells = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons) for layer in range(n_layers)]
# cells = [tf.contrib.rnn.GRUCell(num_units=n_neurons) for layer in range(n_layers)]
cells = [tf.contrib.rnn.LSTMCell(num_units=n_neurons) for layer in range(n_layers)]#定义堆叠LSTM单元
multi_cell = tf.contrib.rnn.MultiRNNCell(cells)   #堆叠多层LSTM单元
outputs, states = tf.nn.dynamic_rnn(multi_cell, X_data, dtype=tf.float32) #outputs(?, 56, 128)
print(outputs.shape)
# top_layer_h_state = states[-1][1] #最顶层隐藏层状态（最后）  (?,128)
# logits = tf.layers.dense(outputs[:,-1], n_outputs, name="softmax") #用最后一个Cell的输出
logits = fully_connected(outputs[:,-1], n_outputs, activation_fn=None) #用最后一个Cell的输出

# 定义代价或损失函数
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
#定义Adam优化器
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) # 优化器

# tf.summary.scalar("loss", cost)
# summary = tf.summary.merge_all()
# global_step = 0
# 创建会话
sess = tf.Session()
sess.run(tf.global_variables_initializer()) #全局变量初始化
# writer = tf.summary.FileWriter(TB_SUMMARY_DIR, sess.graph)
#计算准确率
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('开始学习...')
for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = int(np.ceil(total / batch_size))  #计算总批次
    group = int(np.ceil(total_batch / 10))
    g_b = 0
    for i in range(total_batch):
        batch_xs, batch_ys = next_batch(batch_size)  #分批次提取数据
        feed_dict = {X: batch_xs, Y: batch_ys}
        # s, c, _ = sess.run([summary, cost, optimizer], feed_dict=feed_dict)
        c, _, acc = sess.run([cost, optimizer, accuracy], feed_dict=feed_dict)  #执行训练，计算代价
        if i % group == 0:
            print(f'epoch#{epoch + 1}: batch#{i + 1}: cost = {c}, acc = {acc}')
        avg_cost += c / total_batch  #计算每个批次的平均代价
        # writer.add_summary(s, global_step=global_step)
        # global_step = global_step + 1
    if i % group != 0:
        print(f'epoch#{epoch + 1}: batch#{i + 1}: cost = {c}, acc = {acc}')

    acc = sess.run(accuracy, feed_dict={X: x_dev, Y: y_dev})
    print('Epoch:', (epoch + 1), 'cost =', avg_cost, 'acc=', acc)
print('学习完成')

# 测试模型检查准确率
print('Accuracy:', sess.run(accuracy, feed_dict={X: x_dev, Y: y_dev}))

'''
Epoch: 1 cost = 0.6905622550305104 acc= 0.6163227
Epoch: 2 cost = 0.5400942524007504 acc= 0.72232646
Epoch: 3 cost = 0.35755642148472305 acc= 0.7448405
Epoch: 4 cost = 0.2213559322209166 acc= 0.750469
Epoch: 5 cost = 0.14684994823780637 acc= 0.74108815
Epoch: 6 cost = 0.09073889089885778 acc= 0.74859285
Epoch: 7 cost = 0.07446735336964061 acc= 0.750469
Epoch: 8 cost = 0.049180956293309985 acc= 0.7307692
Epoch: 9 cost = 0.030468408543391504 acc= 0.7429643
Epoch: 10 cost = 0.02506378301084916 acc= 0.74108815
学习完成
Accuracy: 0.74108815(GRUCell)
0.74202627(LSTM)
0.48311445 BasicRNNCell
'''
